import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# IPython/Jupyter magic: render matplotlib figures inline in the notebook.
# (Not valid plain-Python syntax; only works inside IPython.)
%matplotlib inline
from sklearn.metrics import classification_report, confusion_matrix
# Load the heart-failure clinical records dataset (299 rows x 13 columns,
# per the shape check below) and preview the first five rows.
data = 'heartfailure.csv'
df= pd.read_csv(data)
df.head()
| age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 75.0 | 0 | 582 | 0 | 20 | 1 | 265000.00 | 1.9 | 130 | 1 | 0 | 4 | 1 |
| 1 | 55.0 | 0 | 7861 | 0 | 38 | 0 | 263358.03 | 1.1 | 136 | 1 | 0 | 6 | 1 |
| 2 | 65.0 | 0 | 146 | 0 | 20 | 0 | 162000.00 | 1.3 | 129 | 1 | 1 | 7 | 1 |
| 3 | 50.0 | 1 | 111 | 0 | 20 | 0 | 210000.00 | 1.9 | 137 | 1 | 0 | 7 | 1 |
| 4 | 65.0 | 1 | 160 | 1 | 20 | 0 | 327000.00 | 2.7 | 116 | 0 | 0 | 8 | 1 |
# Count missing values per column -- the dataset is complete (all zeros below).
df.isnull().sum() #no missing value
age 0 anaemia 0 creatinine_phosphokinase 0 diabetes 0 ejection_fraction 0 high_blood_pressure 0 platelets 0 serum_creatinine 0 serum_sodium 0 sex 0 smoking 0 time 0 DEATH_EVENT 0 dtype: int64
# Dataframe dimensions: (rows, columns).
df.shape #299 observations and 13 features
(299, 13)
# List the column names.
df.columns
Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
'ejection_fraction', 'high_blood_pressure', 'platelets',
'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
'DEATH_EVENT'],
dtype='object')
# Dtypes, non-null counts and memory usage in one summary.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 299 entries, 0 to 298 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 299 non-null float64 1 anaemia 299 non-null int64 2 creatinine_phosphokinase 299 non-null int64 3 diabetes 299 non-null int64 4 ejection_fraction 299 non-null int64 5 high_blood_pressure 299 non-null int64 6 platelets 299 non-null float64 7 serum_creatinine 299 non-null float64 8 serum_sodium 299 non-null int64 9 sex 299 non-null int64 10 smoking 299 non-null int64 11 time 299 non-null int64 12 DEATH_EVENT 299 non-null int64 dtypes: float64(3), int64(10) memory usage: 30.5 KB
#to check the datatype of each column (3 float64, 10 int64 per the output)
df.dtypes
age float64 anaemia int64 creatinine_phosphokinase int64 diabetes int64 ejection_fraction int64 high_blood_pressure int64 platelets float64 serum_creatinine float64 serum_sodium int64 sex int64 smoking int64 time int64 DEATH_EVENT int64 dtype: object
#Let's compare the death (1) and survival (0)
# Use normalize=True instead of dividing by the hard-coded row count 299:
# identical result on the full dataset, but it stays correct if any rows
# are filtered out before this cell runs (as happens later with the
# ejection_fraction outlier removal).
death_rate = df.DEATH_EVENT.value_counts(normalize=True)
death_rate
0 0.67893 1 0.32107 Name: DEATH_EVENT, dtype: float64
#About 68% survived while 32% died from heart failure
sns.countplot(x="DEATH_EVENT", data = df)
plt.title('Distribution of Heart Failure')
plt.show()
#203 survive 96 death
#Analysing age feature
# Summary statistics for age: min, mean, max.
df['age'].min(), df['age'].mean() , df['age'].max()
(40.0, 60.83389297658862, 95.0)
#The minimum age we have for any patient is 40, mean age as 60 and maximum as 95.
#Majority of the patient's age lies near 60-61 Years.
sns.displot(df, x="age")
plt.title('Age Distribution')
plt.show()
#We can also confirm from above plot, that the majority of patients lie near 60-61 Years of age.
#Finding the relationship between age and heart failure
sns.displot(df, x="age", hue = 'DEATH_EVENT')
plt.title('Age and Heart Failure')
plt.show()
# NOTE(review): the two claims below are eyeballed from the histogram,
# not computed; treat them as rough tendencies rather than hard cut-offs.
#age 79 to 95 are more likely to die
#age 40 to 78 are going to survive
#Gender
sns.countplot(x="sex", data = df)
plt.title('Distribution of Sex Feature')
plt.show()
# Majority of the population is male (sex == 1; ~65% per the gender pie
# chart further below and df.describe()) -- the original comment said
# "female" in error.
sns.countplot(x="sex", hue = 'DEATH_EVENT', data = df)
plt.title('Sex vs Heart Disease')
plt.show()
#males have more death events and more survivals in absolute counts; both are lower for females
#high blood pressure distribution
sns.countplot(x="high_blood_pressure", data = df)
plt.title('Distribution of high blood pressure Feature')
plt.show()
sns.displot(df, x="high_blood_pressure", hue = 'DEATH_EVENT')
plt.title('High Blood Pressure and Heart Failure')
plt.show()
# Follow-up period ("time", in days) split by outcome.
sns.displot(df, x="time", hue = 'DEATH_EVENT')
plt.title('time and Heart Failure')
plt.show()
# Feature importance and Selection
plt.rcParams['figure.figsize']=10,4
sns.set_style("darkgrid")
# Features (all columns except DEATH_EVENT) and target (DEATH_EVENT).
x = df.iloc[:, :-1]
y = df.iloc[:,-1]
from sklearn.ensemble import ExtraTreesClassifier
# random_state pinned so the importance ranking is reproducible across runs
# (ExtraTreesClassifier is randomized by default).  Duplicate
# `import matplotlib.pyplot as plt` removed -- plt is already imported at
# the top of the notebook.
model = ExtraTreesClassifier(random_state=0)
model.fit(x,y)
print(model.feature_importances_)
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(12).plot(kind='barh')
[0.09949726 0.02733504 0.07899392 0.02788416 0.12244019 0.0207539 0.07382136 0.12237323 0.08266914 0.02624013 0.02920331 0.28878836]
<AxesSubplot:>
# We will select only features : time, ejection_fraction, serum_creatinine, age
#To check outliers
#Boxplot for ejection_fraction
sns.boxplot(x = df.ejection_fraction, color = 'teal')
plt.show()
#there are two outliers (ejection_fraction >= 70) and we can remove them
df[df['ejection_fraction']>=70]
| age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 64 | 45.0 | 0 | 582 | 0 | 80 | 0 | 263358.03 | 1.18 | 137 | 0 | 0 | 63 | 0 |
| 217 | 54.0 | 1 | 427 | 0 | 70 | 1 | 151000.00 | 9.00 | 137 | 0 | 0 | 196 | 1 |
# Keep only rows with ejection_fraction below 70, dropping the two outliers
# found above (the original comment mistakenly said "age below 70").
# This leaves 297 of the 299 rows.
df = df[df['ejection_fraction']<70]
#let's check the outliers for time
sns.boxplot(x=df.time, color = 'teal')
plt.show()
#there is no outlier for time
#let's find the outliers for serum creatinine
# Boxplot for serum creatinine
sns.boxplot(x=df.serum_creatinine, color = 'teal')
plt.show()
#domain knowledge shows the extreme serum creatinine values are not outliers but datapoints that assist in predicting death events
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Partition by outcome and sex (sex == 1 is male, 0 is female).
d1 = df[(df["DEATH_EVENT"]==0) & (df["sex"]==1)]
d2 = df[(df["DEATH_EVENT"]==1) & (df["sex"]==1)]
d3 = df[(df["DEATH_EVENT"]==0) & (df["sex"]==0)]
d4 = df[(df["DEATH_EVENT"]==1) & (df["sex"]==0)]
label1 = ["Male","Female"]
label2 = ['Male - Survived','Male - Died', "Female - Survived", "Female - Died"]
# values1 = [all males, all females]; values2 follows label2's order.
values1 = [(len(d1)+len(d2)), (len(d3)+len(d4))]
values2 = [len(d1),len(d2),len(d3),len(d4)]
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="GENDER"),
1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="GENDER VS DEATH_EVENT"),
1, 2)
# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(
title_text="GENDER DISTRIBUTION IN THE DATASET \
GENDER VS DEATH_EVENT",
# Add annotations in the center of the donut pies.
annotations=[dict(text='GENDER', x=0.19, y=0.5, font_size=10, showarrow=False),
dict(text='GENDER VS DEATH_EVENT', x=0.84, y=0.5, font_size=9, showarrow=False)],
autosize=False,width=1200, height=500, paper_bgcolor="white")
fig.show()
#65.3% are male where 44.4% survived and 20.9% died, 34.7% female 23.6% survived and 11.1% died
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Partition by diabetes status and outcome.
d1 = df[(df["DEATH_EVENT"]==0) & (df["diabetes"]==1)]   # diabetic, survived
d2 = df[(df["DEATH_EVENT"]==1) & (df["diabetes"]==1)]   # diabetic, died
d3 = df[(df["DEATH_EVENT"]==0) & (df["diabetes"]==0)]   # non-diabetic, survived
d4 = df[(df["DEATH_EVENT"]==1) & (df["diabetes"]==0)]   # non-diabetic, died

label1 = ["No Diabetes","Diabetes"]
# BUGFIX: previously values1 summed survivors vs deaths (d1+d3 / d2+d4)
# while label1 claims diabetes status, and label2 listed the slices in a
# different order than values2 -- both pies were mislabeled.  Labels and
# values are now aligned (confirmed by the hint below: 57.9% non-diabetic,
# matching diabetes mean ~0.42 in df.describe()).
label2 = ['Diabetes - Survived', 'Diabetes - Died', 'No Diabetes - Survived', 'No Diabetes - Died']
values1 = [(len(d3)+len(d4)), (len(d1)+len(d2))]   # [no diabetes, diabetes]
values2 = [len(d1), len(d2), len(d3), len(d4)]     # same order as label2

# Create subplots: use 'domain' type for Pie subplots.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="DIABETES"), 1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="DIABETES VS DEATH_EVENT"), 1, 2)
# Use `hole` to create a donut-like pie chart.
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(
    title_text="DIABETES DISTRIBUTION IN THE DATASET" + " " * 30 + "DIABETES VS DEATH_EVENT",
    # Annotations in the center of the donut pies.
    annotations=[dict(text='DIABETES', x=0.20, y=0.5, font_size=10, showarrow=False),
                 dict(text='DIABETES VS DEATH_EVENT', x=0.84, y=0.5, font_size=8, showarrow=False)],
    autosize=False, width=1200, height=500, paper_bgcolor="white")
fig.show()
#hint: 57.9% are NON DIABETIC (out of which 39.4% survived and 18.5% died) and 42.1% are DIABETIC (out of which 28.6% survived and 13.5% died).
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Partition by anaemia status and outcome.
d1 = df[(df["DEATH_EVENT"]==0) & (df["anaemia"]==1)]   # anaemic, survived
d2 = df[(df["DEATH_EVENT"]==1) & (df["anaemia"]==1)]   # anaemic, died
d3 = df[(df["DEATH_EVENT"]==0) & (df["anaemia"]==0)]   # non-anaemic, survived
d4 = df[(df["DEATH_EVENT"]==1) & (df["anaemia"]==0)]   # non-anaemic, died

label1 = ["No Anaemia","Anaemia"]
# BUGFIX: values1 previously put the anaemic count (d1+d2) under the
# "No Anaemia" label and vice versa, and label2 was ordered opposite to
# values2.  Now aligned (hint below: 56.9% non-anaemic, matching anaemia
# mean ~0.43 in df.describe()).
label2 = ['Anaemia - Survived', 'Anaemia - Died', 'No Anaemia - Survived', 'No Anaemia - Died']
values1 = [(len(d3)+len(d4)), (len(d1)+len(d2))]   # [no anaemia, anaemia]
values2 = [len(d1), len(d2), len(d3), len(d4)]     # same order as label2

# Create subplots: use 'domain' type for Pie subplots.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="ANAEMIA"), 1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="ANAEMIA VS DEATH_EVENT"), 1, 2)
# Use `hole` to create a donut-like pie chart.
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(
    title_text="ANAEMIA DISTRIBUTION IN THE DATASET" + " " * 30 + "ANAEMIA VS DEATH_EVENT",
    # Annotations in the center of the donut pies.
    annotations=[dict(text='ANAEMIA', x=0.20, y=0.5, font_size=10, showarrow=False),
                 dict(text='ANAEMIA VS DEATH_EVENT', x=0.84, y=0.5, font_size=8, showarrow=False)],
    autosize=False, width=1200, height=500, paper_bgcolor="white")
fig.show()
#hint: 56.9% are NON ANAEMIC (out of which 40.1% survived and 16.8% died) and 43.1% are ANAEMIC (out of which 27.9% survived and 15.2% died).
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Partition by smoking status and outcome.
d1 = df[(df["DEATH_EVENT"]==0) & (df["smoking"]==1)]   # smoker, survived
d2 = df[(df["DEATH_EVENT"]==1) & (df["smoking"]==1)]   # smoker, died
d3 = df[(df["DEATH_EVENT"]==0) & (df["smoking"]==0)]   # non-smoker, survived
d4 = df[(df["DEATH_EVENT"]==1) & (df["smoking"]==0)]   # non-smoker, died

label1 = ["No Smoking","Smoking"]
# BUGFIX: values1 previously put the smoker count (d1+d2) under the
# "No Smoking" label and vice versa, and label2 was ordered opposite to
# values2.  Now aligned (hint below: 67.7% non-smokers, matching smoking
# mean ~0.32 in df.describe()).
label2 = ['Smoking - Survived', 'Smoking - Died', 'No Smoking - Survived', 'No Smoking - Died']
values1 = [(len(d3)+len(d4)), (len(d1)+len(d2))]   # [no smoking, smoking]
values2 = [len(d1), len(d2), len(d3), len(d4)]     # same order as label2

# Create subplots: use 'domain' type for Pie subplots.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="SMOKING"), 1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="SMOKING VS DEATH_EVENT"), 1, 2)
# Use `hole` to create a donut-like pie chart.
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(
    title_text="SMOKING DISTRIBUTION IN THE DATASET" + " " * 30 + "SMOKING VS DEATH_EVENT",
    # Annotations in the center of the donut pies.
    annotations=[dict(text='SMOKING', x=0.20, y=0.5, font_size=10, showarrow=False),
                 dict(text='SMOKING VS DEATH_EVENT', x=0.84, y=0.5, font_size=8, showarrow=False)],
    autosize=False, width=1200, height=500, paper_bgcolor="white")
fig.show()
# hint: 67.7% do not SMOKE (out of which 45.8% survived and 21.9% died) and 32.3% do SMOKE (out of which 22.2% survived and 10.1% died).
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Partition by high-blood-pressure status and outcome.
d1 = df[(df["DEATH_EVENT"]==0) & (df["high_blood_pressure"]==1)]   # high BP, survived
d2 = df[(df["DEATH_EVENT"]==1) & (df["high_blood_pressure"]==1)]   # high BP, died
d3 = df[(df["DEATH_EVENT"]==0) & (df["high_blood_pressure"]==0)]   # no high BP, survived
d4 = df[(df["DEATH_EVENT"]==1) & (df["high_blood_pressure"]==0)]   # no high BP, died

label1 = ["No High BP","High BP"]
# BUGFIX: values1 previously put the high-BP count (d1+d2) under the
# "No High BP" label and vice versa, and label2 was ordered opposite to
# values2.  Now aligned (hint below: 65% without high BP, matching
# high_blood_pressure mean ~0.35 in df.describe()).
label2 = ['High BP - Survived', 'High BP - Died', 'No High BP - Survived', 'No High BP - Died']
values1 = [(len(d3)+len(d4)), (len(d1)+len(d2))]   # [no high BP, high BP]
values2 = [len(d1), len(d2), len(d3), len(d4)]     # same order as label2

# Create subplots: use 'domain' type for Pie subplots.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=label1, values=values1, name="HIGH BP"), 1, 1)
fig.add_trace(go.Pie(labels=label2, values=values2, name="HIGH BP VS DEATH_EVENT"), 1, 2)
# Use `hole` to create a donut-like pie chart.
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(
    title_text="HIGH BP DISTRIBUTION IN THE DATASET" + " " * 30 + "HIGH BP VS DEATH_EVENT",
    # Annotations in the center of the donut pies.
    annotations=[dict(text='HIGH BP', x=0.20, y=0.5, font_size=10, showarrow=False),
                 dict(text='HIGH BP VS DEATH_EVENT', x=0.84, y=0.5, font_size=8, showarrow=False)],
    autosize=False, width=1200, height=500, paper_bgcolor="white")
fig.show()
#hint: 65% do not have HIGH BLOOD PRESSURE (out of which 45.8% survived and 19.2% died) and 35% have HIGH BLOOD PRESSURE (out of which 22.2% survived and 12.8% died).
# Doing Univariate Analysis for statistical description and understanding of dispersion of data
# (transposed so each row is a feature; count is 297 because the two
# ejection_fraction outliers were removed above)
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| age | 297.0 | 60.910215 | 11.892512 | 40.0 | 51.0 | 60.0 | 70.0 | 95.0 |
| anaemia | 297.0 | 0.430976 | 0.496049 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| creatinine_phosphokinase | 297.0 | 582.360269 | 973.518622 | 23.0 | 115.0 | 249.0 | 582.0 | 7861.0 |
| diabetes | 297.0 | 0.420875 | 0.494533 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| ejection_fraction | 297.0 | 37.835017 | 11.470426 | 14.0 | 30.0 | 38.0 | 45.0 | 65.0 |
| high_blood_pressure | 297.0 | 0.350168 | 0.477828 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| platelets | 297.0 | 263736.339125 | 97915.822481 | 25100.0 | 213000.0 | 262000.0 | 304000.0 | 850000.0 |
| serum_creatinine | 297.0 | 1.368990 | 0.938731 | 0.5 | 0.9 | 1.1 | 1.4 | 9.4 |
| serum_sodium | 297.0 | 136.622896 | 4.427251 | 113.0 | 134.0 | 137.0 | 140.0 | 148.0 |
| sex | 297.0 | 0.653199 | 0.476755 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 |
| smoking | 297.0 | 0.323232 | 0.468500 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| time | 297.0 | 130.265993 | 77.683871 | 4.0 | 73.0 | 115.0 | 205.0 | 285.0 |
| DEATH_EVENT | 297.0 | 0.319865 | 0.467211 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
# Same summary statistics, untransposed.
df.describe()
| age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 |
| mean | 60.910215 | 0.430976 | 582.360269 | 0.420875 | 37.835017 | 0.350168 | 263736.339125 | 1.368990 | 136.622896 | 0.653199 | 0.323232 | 130.265993 | 0.319865 |
| std | 11.892512 | 0.496049 | 973.518622 | 0.494533 | 11.470426 | 0.477828 | 97915.822481 | 0.938731 | 4.427251 | 0.476755 | 0.468500 | 77.683871 | 0.467211 |
| min | 40.000000 | 0.000000 | 23.000000 | 0.000000 | 14.000000 | 0.000000 | 25100.000000 | 0.500000 | 113.000000 | 0.000000 | 0.000000 | 4.000000 | 0.000000 |
| 25% | 51.000000 | 0.000000 | 115.000000 | 0.000000 | 30.000000 | 0.000000 | 213000.000000 | 0.900000 | 134.000000 | 0.000000 | 0.000000 | 73.000000 | 0.000000 |
| 50% | 60.000000 | 0.000000 | 249.000000 | 0.000000 | 38.000000 | 0.000000 | 262000.000000 | 1.100000 | 137.000000 | 1.000000 | 0.000000 | 115.000000 | 0.000000 |
| 75% | 70.000000 | 1.000000 | 582.000000 | 1.000000 | 45.000000 | 1.000000 | 304000.000000 | 1.400000 | 140.000000 | 1.000000 | 1.000000 | 205.000000 | 1.000000 |
| max | 95.000000 | 1.000000 | 7861.000000 | 1.000000 | 65.000000 | 1.000000 | 850000.000000 | 9.400000 | 148.000000 | 1.000000 | 1.000000 | 285.000000 | 1.000000 |
# Select features by name rather than positional indices [4, 7, 11] so the
# selection is robust to column reordering.  These are the same three
# columns: ejection_fraction, serum_creatinine and time -- the top-ranked
# features from the importance plot above.
# NOTE(review): the earlier comment proposed also selecting 'age', but it
# was never included here; confirm whether that was intentional.
x = df[['ejection_fraction', 'serum_creatinine', 'time']].values
y = df.iloc[:,-1].values
print(x)
[[ 20. 1.9 4. ] [ 38. 1.1 6. ] [ 20. 1.3 7. ] [ 20. 1.9 7. ] [ 20. 2.7 8. ] [ 40. 2.1 8. ] [ 15. 1.2 10. ] [ 60. 1.1 10. ] [ 65. 1.5 10. ] [ 35. 9.4 10. ] [ 38. 4. 10. ] [ 25. 0.9 10. ] [ 30. 1.1 11. ] [ 38. 1.1 11. ] [ 30. 1. 12. ] [ 50. 1.3 13. ] [ 38. 0.9 14. ] [ 14. 0.8 14. ] [ 25. 1. 15. ] [ 55. 1.9 15. ] [ 25. 1.3 16. ] [ 30. 1.6 20. ] [ 35. 0.9 20. ] [ 60. 0.8 22. ] [ 30. 1.83 23. ] [ 38. 1.9 23. ] [ 40. 1. 24. ] [ 45. 1.3 26. ] [ 38. 5.8 26. ] [ 30. 1.2 26. ] [ 38. 1.83 27. ] [ 45. 3. 28. ] [ 35. 1. 28. ] [ 30. 1.2 29. ] [ 50. 1. 29. ] [ 35. 3.5 30. ] [ 50. 1. 30. ] [ 50. 1. 30. ] [ 30. 2.3 30. ] [ 38. 3. 30. ] [ 20. 1.83 31. ] [ 30. 1.2 32. ] [ 45. 1.2 33. ] [ 50. 1. 33. ] [ 60. 1.1 33. ] [ 38. 1.9 35. ] [ 25. 0.9 38. ] [ 38. 0.6 40. ] [ 20. 4.4 41. ] [ 30. 1. 42. ] [ 25. 1. 43. ] [ 20. 1.4 43. ] [ 62. 6.8 43. ] [ 50. 1. 44. ] [ 38. 2.2 45. ] [ 30. 2. 50. ] [ 35. 2.7 54. ] [ 40. 0.6 54. ] [ 20. 1.1 55. ] [ 20. 1.3 59. ] [ 25. 1. 60. ] [ 40. 2.3 60. ] [ 35. 1.1 60. ] [ 35. 1. 61. ] [ 20. 2.9 64. ] [ 15. 1.3 65. ] [ 25. 1. 65. ] [ 25. 1.2 66. ] [ 25. 1.83 67. ] [ 40. 0.8 68. ] [ 35. 0.9 71. ] [ 35. 1. 72. ] [ 50. 1.3 72. ] [ 20. 1.2 73. ] [ 20. 0.7 73. ] [ 60. 0.8 74. ] [ 40. 1.2 74. ] [ 38. 0.6 74. ] [ 45. 0.9 74. ] [ 40. 1.7 75. ] [ 50. 1.18 76. ] [ 25. 2.5 77. ] [ 50. 1.8 78. ] [ 25. 1. 78. ] [ 50. 0.7 79. ] [ 35. 1.1 79. ] [ 60. 0.8 79. ] [ 40. 0.7 79. ] [ 25. 1.1 79. ] [ 45. 0.8 80. ] [ 45. 1. 80. ] [ 60. 1.18 82. ] [ 25. 1.7 82. ] [ 38. 0.7 83. ] [ 60. 1. 83. ] [ 25. 1.3 83. ] [ 60. 1.1 85. ] [ 25. 1.2 85. ] [ 40. 1.1 86. ] [ 25. 1.1 87. ] [ 45. 1.18 87. ] [ 25. 1.1 87. ] [ 30. 1. 87. ] [ 50. 2.3 87. ] [ 30. 1.7 88. ] [ 45. 1.3 88. ] [ 35. 0.9 88. ] [ 38. 1.1 88. ] [ 35. 1.3 88. ] [ 60. 1.2 90. ] [ 35. 1.2 90. ] [ 25. 1.6 90. ] [ 60. 1.3 90. ] [ 40. 1.2 91. ] [ 40. 1. 91. ] [ 60. 0.7 94. ] [ 60. 3.2 94. ] [ 60. 0.9 94. ] [ 38. 1.83 95. ] [ 60. 1.5 95. ] [ 38. 1. 95. ] [ 38. 0.75 95. ] [ 30. 0.9 95. ] [ 40. 3.7 96. ] [ 50. 1.3 97. ] [ 17. 2.1 100. 
] [ 60. 0.8 104. ] [ 30. 0.7 104. ] [ 35. 3.4 105. ] [ 60. 0.7 106. ] [ 45. 6.1 107. ] [ 40. 1.18 107. ] [ 60. 1.3 107. ] [ 35. 1.18 107. ] [ 40. 1.18 107. ] [ 60. 0.9 107. ] [ 25. 2.1 108. ] [ 35. 1. 108. ] [ 30. 0.8 108. ] [ 38. 1.1 109. ] [ 35. 0.9 109. ] [ 30. 0.9 109. ] [ 40. 0.9 110. ] [ 25. 1.7 111. ] [ 30. 0.7 112. ] [ 30. 0.7 112. ] [ 60. 1. 113. ] [ 30. 1.83 113. ] [ 35. 0.9 115. ] [ 45. 2.5 115. ] [ 60. 0.9 117. ] [ 45. 0.9 118. ] [ 35. 1.18 119. ] [ 35. 0.8 120. ] [ 25. 1.7 120. ] [ 35. 1.4 120. ] [ 25. 1. 120. ] [ 50. 1.3 121. ] [ 45. 1.1 121. ] [ 40. 1.2 121. ] [ 35. 0.8 121. ] [ 40. 0.9 123. ] [ 35. 0.9 126. ] [ 30. 1.1 129. ] [ 38. 1.3 130. ] [ 60. 0.7 134. ] [ 20. 2.4 135. ] [ 40. 1. 140. ] [ 35. 0.8 145. ] [ 35. 1.5 145. ] [ 40. 0.9 146. ] [ 60. 1.1 146. ] [ 20. 0.8 146. ] [ 35. 0.9 146. ] [ 60. 1. 146. ] [ 40. 1. 147. ] [ 50. 1. 147. ] [ 60. 1.2 147. ] [ 40. 0.7 147. ] [ 30. 0.9 148. ] [ 25. 1. 150. ] [ 25. 1.2 154. ] [ 38. 2.5 162. ] [ 25. 1.2 170. ] [ 30. 1.5 171. ] [ 50. 0.6 172. ] [ 25. 2.1 172. ] [ 40. 1. 172. ] [ 45. 0.9 174. ] [ 35. 2.1 174. ] [ 60. 1.5 174. ] [ 40. 0.7 175. ] [ 30. 1.18 180. ] [ 20. 1.6 180. ] [ 45. 1.8 180. ] [ 38. 1.18 185. ] [ 30. 0.8 186. ] [ 20. 1. 186. ] [ 35. 1.8 186. ] [ 45. 0.7 186. ] [ 60. 1. 186. ] [ 60. 0.9 186. ] [ 25. 3.5 187. ] [ 40. 0.7 187. ] [ 45. 1. 187. ] [ 40. 0.8 187. ] [ 38. 0.9 187. ] [ 40. 1. 187. ] [ 35. 0.8 187. ] [ 17. 1. 188. ] [ 62. 0.8 192. ] [ 50. 1.4 192. ] [ 30. 1.6 193. ] [ 35. 0.8 194. ] [ 35. 1.3 195. ] [ 50. 0.9 196. ] [ 35. 1.1 197. ] [ 35. 0.7 197. ] [ 20. 1.83 198. ] [ 50. 1.1 200. ] [ 35. 1.1 201. ] [ 25. 0.8 201. ] [ 25. 1. 205. ] [ 60. 1.4 205. ] [ 25. 1.3 205. ] [ 35. 1. 206. ] [ 25. 5. 207. ] [ 25. 1.2 207. ] [ 30. 1.7 207. ] [ 35. 1.1 208. ] [ 35. 0.9 209. ] [ 38. 1.4 209. ] [ 45. 1.1 209. ] [ 50. 1.1 209. ] [ 50. 1.1 209. ] [ 30. 1.2 210. ] [ 40. 1. 210. ] [ 45. 1.18 211. ] [ 35. 1.3 212. ] [ 30. 1.3 212. ] [ 35. 1.1 212. ] [ 40. 0.9 213. ] [ 38. 1.8 213. ] [ 38. 1.4 213. 
] [ 25. 1.1 214. ] [ 25. 2.4 214. ] [ 35. 1. 214. ] [ 40. 1.2 214. ] [ 30. 0.5 214. ] [ 35. 0.8 215. ] [ 45. 1. 215. ] [ 35. 1.2 215. ] [ 60. 1. 215. ] [ 30. 1. 216. ] [ 38. 1.7 220. ] [ 38. 1. 230. ] [ 25. 0.8 230. ] [ 50. 0.7 231. ] [ 40. 1. 233. ] [ 40. 0.7 233. ] [ 25. 1.4 235. ] [ 60. 1. 237. ] [ 38. 1.2 237. ] [ 35. 0.9 240. ] [ 20. 1.83 241. ] [ 38. 1.7 244. ] [ 38. 0.9 244. ] [ 35. 1. 244. ] [ 30. 1.6 244. ] [ 40. 0.9 244. ] [ 38. 1.2 245. ] [ 40. 0.7 245. ] [ 30. 1. 245. ] [ 38. 0.8 245. ] [ 35. 1.1 245. ] [ 38. 1.1 246. ] [ 30. 0.7 246. ] [ 38. 1.3 246. ] [ 40. 1. 247. ] [ 40. 2.7 250. ] [ 30. 3.8 250. ] [ 38. 1.1 250. ] [ 40. 0.8 250. ] [ 40. 1.2 250. ] [ 35. 1.7 250. ] [ 55. 1. 250. ] [ 35. 1.1 256. ] [ 38. 0.9 256. ] [ 55. 0.8 257. ] [ 35. 1.4 258. ] [ 38. 1. 258. ] [ 35. 0.9 270. ] [ 38. 1.1 270. ] [ 38. 1.2 271. ] [ 60. 0.8 278. ] [ 38. 1.4 280. ] [ 45. 1.6 285. ]]
# Target vector (DEATH_EVENT labels).
print(y)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# Splitting the dataset into training set and test set
from sklearn.model_selection import train_test_split
# random_state pinned for a reproducible split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state =0)
#80% train, 20% test
x_train.shape, x_test.shape, y_train.shape, y_test.shape
((237, 3), (60, 3), (237,), (60,))
# Training features before scaling.
print(x_train)
[[ 40. 0.8 250. ] [ 25. 2.5 77. ] [ 35. 1.3 88. ] [ 60. 1. 113. ] [ 35. 0.7 197. ] [ 25. 1.3 16. ] [ 25. 0.9 38. ] [ 35. 0.8 120. ] [ 60. 1.4 205. ] [ 35. 1.1 208. ] [ 45. 1.3 26. ] [ 20. 1.83 198. ] [ 25. 0.8 230. ] [ 50. 1. 30. ] [ 25. 1.7 82. ] [ 40. 1.2 121. ] [ 45. 1. 80. ] [ 40. 0.9 244. ] [ 50. 1.3 121. ] [ 35. 1.8 186. ] [ 30. 1.3 212. ] [ 50. 1.1 209. ] [ 40. 1.2 74. ] [ 25. 1. 205. ] [ 40. 1. 140. ] [ 35. 1. 206. ] [ 25. 1.2 85. ] [ 60. 1.1 33. ] [ 35. 1.3 212. ] [ 25. 1.2 154. ] [ 35. 0.8 145. ] [ 40. 1. 147. ] [ 38. 2.2 45. ] [ 60. 0.9 107. ] [ 25. 1. 15. ] [ 45. 0.9 174. ] [ 25. 2.4 214. ] [ 60. 3.2 94. ] [ 38. 1.83 95. ] [ 35. 1. 72. ] [ 35. 1. 108. ] [ 35. 1.1 212. ] [ 60. 0.9 186. ] [ 55. 1. 250. ] [ 62. 0.8 192. ] [ 40. 0.7 233. ] [ 25. 1. 120. ] [ 30. 1.2 29. ] [ 25. 1. 78. ] [ 60. 0.7 106. ] [ 25. 1. 60. ] [ 35. 1.18 119. ] [ 55. 0.8 257. ] [ 35. 1.2 90. ] [ 30. 1.6 20. ] [ 30. 1.2 26. ] [ 40. 1. 187. ] [ 38. 0.9 14. ] [ 35. 2.7 54. ] [ 60. 0.8 74. ] [ 50. 1.3 97. ] [ 30. 0.7 112. ] [ 45. 1.1 121. ] [ 60. 1. 215. ] [ 60. 1.2 90. ] [ 35. 0.9 146. ] [ 20. 2.7 8. ] [ 60. 1.1 85. ] [ 30. 1. 245. ] [ 40. 2.3 60. ] [ 25. 1.2 66. ] [ 40. 1.18 107. ] [ 45. 1.8 180. ] [ 20. 2.4 135. ] [ 30. 0.7 246. ] [ 20. 1.83 31. ] [ 38. 2.5 162. ] [ 38. 1.1 11. ] [ 38. 1.1 88. ] [ 40. 1.2 250. ] [ 20. 1.9 7. ] [ 25. 1.7 120. ] [ 17. 2.1 100. ] [ 30. 1.83 23. ] [ 38. 1.83 27. ] [ 38. 0.6 74. ] [ 35. 1.1 245. ] [ 50. 0.7 231. ] [ 55. 1.9 15. ] [ 35. 0.8 215. ] [ 25. 5. 207. ] [ 35. 0.9 240. ] [ 50. 1.18 76. ] [ 20. 1.4 43. ] [ 20. 1.3 7. ] [ 35. 0.9 209. ] [ 30. 1.7 88. ] [ 50. 1. 147. ] [ 60. 0.8 79. ] [ 38. 4. 10. ] [ 35. 2.1 174. ] [ 20. 1.1 55. ] [ 30. 1.2 32. ] [ 30. 1. 12. ] [ 35. 1.4 120. ] [ 25. 1. 43. ] [ 38. 0.9 187. ] [ 30. 1.7 207. ] [ 40. 3.7 96. ] [ 50. 0.9 196. ] [ 35. 1.1 60. ] [ 30. 1.5 171. ] [ 25. 0.8 201. ] [ 45. 6.1 107. ] [ 35. 1.4 258. ] [ 17. 1. 188. ] [ 50. 1. 33. ] [ 40. 1. 91. ] [ 30. 0.8 108. ] [ 35. 0.9 270. ] [ 35. 0.8 194. ] [ 45. 2.5 115. 
] [ 60. 1.3 90. ] [ 38. 0.9 256. ] [ 40. 1.1 86. ] [ 30. 1.6 193. ] [ 38. 0.7 83. ] [ 60. 1. 186. ] [ 35. 0.9 126. ] [ 50. 1. 30. ] [ 40. 0.7 147. ] [ 40. 1.2 91. ] [ 20. 1.9 4. ] [ 60. 1. 83. ] [ 25. 1.3 83. ] [ 38. 1.2 271. ] [ 40. 1. 233. ] [ 30. 1.2 210. ] [ 40. 0.8 68. ] [ 30. 1. 42. ] [ 20. 4.4 41. ] [ 35. 1.1 79. ] [ 60. 0.8 278. ] [ 30. 0.9 109. ] [ 60. 0.8 22. ] [ 40. 0.9 213. ] [ 35. 0.9 115. ] [ 25. 1.7 111. ] [ 45. 0.9 74. ] [ 25. 1. 150. ] [ 45. 1.18 87. ] [ 45. 1. 187. ] [ 40. 1.18 107. ] [ 20. 1.83 241. ] [ 38. 1.4 280. ] [ 30. 0.8 186. ] [ 15. 1.2 10. ] [ 25. 1.83 67. ] [ 40. 0.7 187. ] [ 50. 0.7 79. ] [ 40. 0.9 146. ] [ 38. 0.75 95. ] [ 35. 0.9 109. ] [ 38. 1.1 270. ] [ 40. 0.7 245. ] [ 40. 0.9 110. ] [ 30. 1. 216. ] [ 60. 1.18 82. ] [ 50. 1.8 78. ] [ 30. 3.8 250. ] [ 25. 0.9 10. ] [ 60. 1.5 95. ] [ 30. 1. 87. ] [ 35. 3.5 30. ] [ 40. 0.6 54. ] [ 35. 1.5 145. ] [ 38. 1.4 209. ] [ 15. 1.3 65. ] [ 38. 1.1 6. ] [ 38. 1. 95. ] [ 38. 1.2 237. ] [ 25. 2.1 172. ] [ 45. 1.2 33. ] [ 45. 1.3 88. ] [ 60. 1.3 107. ] [ 40. 1.7 75. ] [ 14. 0.8 14. ] [ 25. 1.4 235. ] [ 45. 1.1 209. ] [ 30. 2.3 30. ] [ 35. 1.18 107. ] [ 50. 1. 44. ] [ 38. 1.3 130. ] [ 38. 1.7 220. ] [ 35. 3.4 105. ] [ 50. 1. 29. ] [ 38. 5.8 26. ] [ 25. 1.2 170. ] [ 30. 1.1 129. ] [ 45. 0.9 118. ] [ 25. 1.1 214. ] [ 25. 3.5 187. ] [ 45. 3. 28. ] [ 35. 1. 28. ] [ 30. 0.7 104. ] [ 50. 0.6 172. ] [ 38. 1.1 246. ] [ 30. 1.6 244. ] [ 30. 1.83 113. ] [ 38. 0.8 245. ] [ 60. 1.2 147. ] [ 25. 1.1 87. ] [ 20. 1. 186. ] [ 38. 1.4 213. ] [ 60. 0.7 94. ] [ 38. 1.7 244. ] [ 50. 1.3 72. ] [ 38. 1.9 23. ] [ 60. 0.7 134. ] [ 35. 1.7 250. ] [ 60. 1. 146. ] [ 35. 1.1 256. ] [ 38. 3. 30. ] [ 20. 1.6 180. ] [ 25. 1.1 79. ] [ 35. 0.9 71. ] [ 40. 0.7 79. ] [ 38. 1.8 213. ] [ 38. 1.3 246. ] [ 50. 1.4 192. ] [ 35. 9.4 10. ] [ 38. 1.18 185. ] [ 35. 1.2 215. ] [ 30. 1.18 180. ] [ 60. 0.9 94. ] [ 38. 0.6 40. ] [ 20. 0.8 146. ]]
# Test-set labels.
print(y_test)
[0 0 1 0 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0]
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training set only, then apply the same
# transformation to the test set (avoids test-set leakage).
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
Model Building
Logistic Regression
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# Baseline linear classifier with default hyperparameters; inputs are
# already standardized by the scaler above.
lr = LogisticRegression()
lr.fit(x_train, y_train)
LogisticRegression()
# Predicting the test set
y_pred = lr.predict(x_test)
# Making Confusion Matrix and calculating accuracy score
# mylist accumulates each model's test accuracy for the comparison bar
# chart at the end of the notebook.
mylist = []
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
mylist.append(ac)
print(cm)
print(ac)
[[40 3] [ 4 13]] 0.8833333333333333
# Keep the LR confusion matrix under its own name for the heatmap below.
lr_conf_matrix = confusion_matrix(y_test, y_pred)
lr_conf_matrix
array([[40, 3],
[ 4, 13]], dtype=int64)
# Heatmap of the LR confusion matrix (rows: truth, columns: predicted).
sns.heatmap(lr_conf_matrix, annot=True)
plt.xlabel('predicted')
plt.ylabel('Truth')
Text(70.0, 0.5, 'Truth')
# Unpack the 2x2 confusion matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'True Neg: {tn}')
print(f'False Pos: {fp}')
print(f'False Neg: {fn}')
print(f'True Pos: {tp}')
True Neg: 40 False Pos: 3 False Neg: 4 True Pos: 13
# Per-class precision/recall/F1 for logistic regression.
lr_classification_report = classification_report(y_test, y_pred)
print(lr_classification_report)
precision recall f1-score support
0 0.91 0.93 0.92 43
1 0.81 0.76 0.79 17
accuracy 0.88 60
macro avg 0.86 0.85 0.85 60
weighted avg 0.88 0.88 0.88 60
SVM
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
# Tune the SVM regularization strength C: fit an RBF-kernel SVC for each
# candidate value and record its test-set accuracy.
# DRY fix: the candidate list was previously written out twice (once in
# the loop, once in the plot call); name it once so the two cannot drift.
c_values = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
list1 = []
for c in c_values:
    classifier = SVC(C = c, random_state=0, kernel = 'rbf')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test,y_pred))
# Accuracy as a function of C.
plt.plot(c_values, list1)
plt.show()
# Training the Support Vector Classifier on the Training set
from sklearn.svm import SVC
# C = 0.6 chosen from the accuracy-vs-C sweep above.
classifier = SVC(C = 0.6, random_state=0, kernel = 'rbf')
classifier.fit(x_train, y_train)
SVC(C=0.6, random_state=0)
# Predicting the test set results
y_pred = classifier.predict(x_test)
print(y_pred)
[1 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 1 0]
# Making the confusion matrix and calculating accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm)
print(ac)
# Record SVM accuracy for the final comparison plot.
mylist.append(ac)
[[40 3] [ 3 14]] 0.9
# Keep the SVM confusion matrix under its own name for the heatmap below.
svm_conf_matrix = confusion_matrix(y_test, y_pred)
svm_conf_matrix
array([[40, 3],
[ 3, 14]], dtype=int64)
# Heatmap of the SVM confusion matrix (rows: truth, columns: predicted).
sns.heatmap(svm_conf_matrix, annot=True)
plt.xlabel('predicted')
plt.ylabel('Truth')
Text(70.0, 0.5, 'Truth')
# Unpack the 2x2 confusion matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'True Neg: {tn}')
print(f'False Pos: {fp}')
print(f'False Neg: {fn}')
print(f'True Pos: {tp}')
True Neg: 40 False Pos: 3 False Neg: 3 True Pos: 14
# Per-class precision/recall/F1 for the SVM.
svm_classification_report = classification_report(y_test, y_pred)
print(svm_classification_report)
precision recall f1-score support
0 0.93 0.93 0.93 43
1 0.82 0.82 0.82 17
accuracy 0.90 60
macro avg 0.88 0.88 0.88 60
weighted avg 0.90 0.90 0.90 60
Random Forest
# Training the RandomForest Classifier on the Training set
from sklearn.ensemble import RandomForestClassifier
# 11 entropy-split trees; random_state pinned for reproducibility.
rf = RandomForestClassifier(n_estimators = 11, criterion='entropy', random_state=0)
rf.fit(x_train,y_train)
RandomForestClassifier(criterion='entropy', n_estimators=11, random_state=0)
# Predicting the test set results
y_pred = rf.predict(x_test)
print(y_pred)
[1 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0]
# Making the confusion matrix and calculating the accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
# Record RandomForest accuracy for the final comparison plot.
mylist.append(ac)
print(cm)
print(ac)
[[41 2] [ 1 16]] 0.95
# Keep the RandomForest confusion matrix under its own name for the heatmap.
rf_conf_matrix = confusion_matrix(y_test, y_pred)
rf_conf_matrix
array([[41, 2],
[ 1, 16]], dtype=int64)
# Heatmap of the RandomForest confusion matrix (rows: truth, columns: predicted).
sns.heatmap(rf_conf_matrix, annot=True)
plt.xlabel('predicted')
plt.ylabel('Truth')
Text(70.0, 0.5, 'Truth')
# Unpack the 2x2 confusion matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'True Neg: {tn}')
print(f'False Pos: {fp}')
print(f'False Neg: {fn}')
print(f'True Pos: {tp}')
True Neg: 41 False Pos: 2 False Neg: 1 True Pos: 16
# Per-class precision/recall/F1 for the RandomForest.
rf_classification_report = classification_report(y_test, y_pred)
print(rf_classification_report)
precision recall f1-score support
0 0.98 0.95 0.96 43
1 0.89 0.94 0.91 17
accuracy 0.95 60
macro avg 0.93 0.95 0.94 60
weighted avg 0.95 0.95 0.95 60
Naive Bayes
#Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default priors estimated from y_train.
nb = GaussianNB()
nb.fit(x_train, y_train)
GaussianNB()
# Predicting the test set results.
y_pred = nb.predict(x_test)
print(y_pred)
[1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0]
# Making the confusion matrix and calculating the accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
# Record Naive Bayes accuracy for the final comparison plot.
mylist.append(ac)
print(cm)
print(ac)
[[41 2] [ 6 11]] 0.8666666666666667
# Keep the Naive Bayes confusion matrix under its own name for the heatmap.
nb_conf_matrix = confusion_matrix(y_test, y_pred)
nb_conf_matrix
array([[41, 2],
[ 6, 11]], dtype=int64)
# Heatmap of the Naive Bayes confusion matrix (rows: truth, columns: predicted).
sns.heatmap(nb_conf_matrix, annot=True)
plt.xlabel('predicted')
plt.ylabel('Truth')
Text(70.0, 0.5, 'Truth')
# Unpack the 2x2 confusion matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'True Neg: {tn}')
print(f'False Pos: {fp}')
print(f'False Neg: {fn}')
print(f'True Pos: {tp}')
True Neg: 41 False Pos: 2 False Neg: 6 True Pos: 11
# Per-class precision/recall/F1 for Naive Bayes.
nb_classification_report = classification_report(y_test, y_pred)
print(nb_classification_report)
precision recall f1-score support
0 0.87 0.95 0.91 43
1 0.85 0.65 0.73 17
accuracy 0.87 60
macro avg 0.86 0.80 0.82 60
weighted avg 0.86 0.87 0.86 60
Comparing Model Accuracy
# Plotting accuracy score of different models
# Accuracies in insertion order: Logistic Regression, SVM, RandomForest, Naive Bayes.
mylist
[0.8833333333333333, 0.9, 0.95, 0.8666666666666667]
# Bar chart comparing the test accuracy of the four classifiers.
# (Loop indentation restored to conventional 4 spaces; it was lost in the
# notebook-to-text paste.)
model_names = ["Logistic Regression", "SupportVector", "RandomForest", 'Naive Bayes']
mylist2 = model_names  # keep the original name available too
plt.rcParams['figure.figsize'] = 15, 6
sns.set_style("darkgrid")
ax = sns.barplot(x=mylist2, y=mylist, palette="rocket", saturation=1.5)
plt.xlabel("ML Classifier Models", fontsize=20)
plt.ylabel("% of Accuracy", fontsize=20)
plt.title("Accuracy of different Classifier Models", fontsize=20)
plt.xticks(fontsize=12, horizontalalignment='center', rotation=8)
plt.yticks(fontsize=13)
# Label each bar with its accuracy formatted as a percentage, just above
# the top of the bar.
for bar in ax.patches:
    bar_w, bar_h = bar.get_width(), bar.get_height()
    bar_x, bar_y = bar.get_xy()
    ax.annotate(f'{bar_h:.2%}',
                (bar_x + bar_w / 2, bar_y + bar_h * 1.02),
                ha='center', fontsize='x-large')
plt.show()